import warnings
warnings.filterwarnings("ignore")
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
import plotly
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from collections import Counter
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
# Load a 20k-row sample of the DonorsChoose training data plus the
# per-project resource lines (absolute local paths; adjust per machine).
project_data = pd.read_csv('/home/shivam/Desktop/train_data.csv',nrows=20000)
resource_data = pd.read_csv('/home/shivam/Desktop/resources.csv')
project_data.head(5)
project_data['project_grade_category'].head(10)
# Normalise grade labels into single lowercase tokens
# (e.g. "Grades 3-5" -> "grades_3_5") so CountVectorizer later treats
# each category as exactly one token.
project_data['project_grade_category'] = project_data['project_grade_category'].str.replace(' ','_')
project_data['project_grade_category'] = project_data['project_grade_category'].str.replace('-','_')
project_data['project_grade_category'] = project_data['project_grade_category'].str.lower()
project_data['project_grade_category'].value_counts()
#
# Same idea for subject categories: drop " The ", collapse spaces, and
# join multi-label entries ("Math & Science") with underscores.
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.replace(' The ','')
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.replace(' ','')
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.replace('&','_')
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.replace(',','_')
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.lower()
project_data['project_subject_categories'].value_counts()
#
# Strip the trailing period from teacher prefixes ("Mrs." -> "mrs").
# BUG FIX: regex=False is essential here — in pandas versions where
# Series.str.replace defaults to regex=True, the pattern '.' matches
# EVERY character and would blank out the whole column.
project_data['teacher_prefix'] = project_data['teacher_prefix'].str.replace('.','', regex=False)
project_data['teacher_prefix'] = project_data['teacher_prefix'].str.lower()
project_data['teacher_prefix'].value_counts()
#
# Normalise subject sub-categories the same way as the categories above.
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.replace(' The ','')
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.replace(' ','')
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.replace('&','_')
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.replace(',','_')
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.lower()
project_data['project_subject_subcategories'].value_counts()
#
# Lowercase the two-letter state codes.
project_data['school_state'] = project_data['school_state'].str.lower()
project_data['school_state'].value_counts()
def decontracted(phrase):
    """Expand common English contractions in *phrase* via regex substitution.

    Order matters: the irregular forms ("won't", "can't") are handled
    before the generic suffix rules so the "n't" rule cannot mangle them.
    Note the generic "'s" rule also expands possessives to " is"; that is
    acceptable for bag-of-words features.
    """
    rules = [
        # specific irregular contractions first
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general suffix contractions
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    ]
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase
# English stopword list (https://gist.github.com/sebleier/554280).
# Stored as a SET: preprocess_text() tests membership once per word, and
# set lookup is O(1) versus O(n) for the original list.
# NOTE: this intentionally shadows the `stopwords` corpus imported from
# nltk at the top of the file.
stopwords = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
             "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
             'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
             'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
             'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
             'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
             'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
             'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
             'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
             's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
             've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
             "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
             "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
             'won', "won't", 'wouldn', "wouldn't"}
from tqdm import tqdm
def preprocess_text(text_data):
    """Clean an iterable of raw strings for bag-of-words / embedding models.

    Per sentence: expand contractions, strip the escaped \\r / \\n / \\"
    artefacts left by the CSV export, drop all non-alphanumeric characters,
    remove stopwords, and lowercase.  Returns a list of cleaned strings.
    """
    # Freeze the module-level stopword collection into a set once, so the
    # per-word membership test below is O(1) even if `stopwords` is a list.
    stop = set(stopwords)
    preprocessed_text = []
    # tqdm prints a progress bar over the input
    for sentance in tqdm(text_data):
        sent = decontracted(sentance)
        # these are literal two-character escape sequences in the raw CSV text
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\n', ' ')
        sent = sent.replace('\\"', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        sent = ' '.join(e for e in sent.split() if e.lower() not in stop)
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text
# Apply the text-cleaning pipeline to titles and essays.
preprocessed_titles = preprocess_text(project_data['project_title'].values)
project_data['project_title']=preprocessed_titles
# Concatenate the four essay parts into one text feature.
# NOTE(review): essays 3/4 are NaN for most rows, so map(str) injects the
# literal token "nan" into the combined text — confirm this is intended.
project_data["essay"] = project_data["project_essay_1"].map(str) +\
project_data["project_essay_2"].map(str) + \
project_data["project_essay_3"].map(str) + \
project_data["project_essay_4"].map(str)
preprocessed_essays = preprocess_text(project_data['essay'].values)
project_data['essay']=preprocessed_essays
# Total price/quantity per project id, merged onto the project rows.
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
project_data = pd.merge(project_data, price_data, on='id', how='left')
project_data['price'].head()
project_data.columns
# Drop the label, identifiers and raw essay parts from the feature frame.
data = project_data.drop(['project_is_approved','Unnamed: 0','project_essay_1', 'project_essay_2',
'project_essay_3', 'project_essay_4','teacher_id','project_submitted_datetime'], axis=1)
y = project_data['project_is_approved'].values
X = data
X.head()
# Stratified 64/16/20 train/CV/test split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)
# Report the class balance (approved vs. not) for each split; stratified
# splitting should make all three distributions essentially identical.
for split_name, labels in (('Training lables', y_train),
                           ('CV lables', y_cv),
                           ('Test lables', y_test)):
    print(50*'*', split_name, 50*'*')
    values, freq = np.unique(labels, return_counts=True)
    total = freq[0] + freq[1]
    print('class 0', '{0:.2f}'.format(freq[0] / total * 100))
    print('class 1', '{0:.2f}'.format(freq[1] / total * 100))
# One-hot encode school_state: CountVectorizer fitted on the TRAIN split
# only, then applied to all three splits (avoids train/test leakage).
vectorizer = CountVectorizer()
vectorizer.fit(X_train['school_state'].values)
# we use the fitted CountVectorizer to convert the text to vector
X_train_state_ohe = vectorizer.transform(X_train['school_state'].values)
X_cv_state_ohe = vectorizer.transform(X_cv['school_state'].values)
X_test_state_ohe = vectorizer.transform(X_test['school_state'].values)
print("After vectorizations")
print(X_train_state_ohe.shape, y_train.shape)
print(X_cv_state_ohe.shape, y_cv.shape)
print(X_test_state_ohe.shape, y_test.shape)
# NOTE(review): get_feature_names() was removed in scikit-learn >= 1.2;
# use get_feature_names_out() on newer versions.
print(vectorizer.get_feature_names())
print("="*100)
# Impute missing teacher prefixes with the most frequent value ("Mrs.") on
# all three splits, then one-hot encode with a CountVectorizer fitted on
# the training split only (no leakage).
X_train['teacher_prefix']=X_train['teacher_prefix'].fillna('Mrs.')
X_cv['teacher_prefix']=X_cv['teacher_prefix'].fillna('Mrs.')
# BUG FIX: the original assigned to a misspelled column 'tearcher_prefix',
# which silently created a NEW column and left the real test-split column
# un-imputed.
X_test['teacher_prefix']=X_test['teacher_prefix'].fillna('Mrs.')
vectorizer = CountVectorizer()
vectorizer.fit(X_train['teacher_prefix'].values)
# we use the fitted CountVectorizer to convert the text to vector
X_train_teacher_ohe = vectorizer.transform(X_train['teacher_prefix'].values)
X_cv_teacher_ohe = vectorizer.transform(X_cv['teacher_prefix'].values)
X_test_teacher_ohe = vectorizer.transform(X_test['teacher_prefix'].values)
print("After vectorizations")
print(X_train_teacher_ohe.shape, y_train.shape)
print(X_cv_teacher_ohe.shape, y_cv.shape)
print(X_test_teacher_ohe.shape, y_test.shape)
print(vectorizer.get_feature_names())
print("="*100)
# One-hot encode the (already normalised) grade category, fitted on train.
vectorizer = CountVectorizer()
vectorizer.fit(X_train['project_grade_category'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_grade_ohe = vectorizer.transform(X_train['project_grade_category'].values)
X_cv_grade_ohe = vectorizer.transform(X_cv['project_grade_category'].values)
X_test_grade_ohe = vectorizer.transform(X_test['project_grade_category'].values)
print("After vectorizations")
print(X_train_grade_ohe.shape, y_train.shape)
print(X_cv_grade_ohe.shape, y_cv.shape)
print(X_test_grade_ohe.shape, y_test.shape)
print(vectorizer.get_feature_names())
print("="*100)
from sklearn.preprocessing import MinMaxScaler
# Scale 'price' to [0, 1] using statistics from the training split only.
# BUG FIX: the original used Normalizer, which rescales each ROW to unit
# norm.  With a single feature per row, every positive price maps to
# exactly 1.0, producing a constant, information-free column.  A min-max
# scaler fitted on the train split preserves the price ordering instead.
# reshape(-1, 1): sklearn scalers expect a 2-D array of shape (n_samples, 1).
normalizer = MinMaxScaler()
normalizer.fit(X_train['price'].values.reshape(-1,1))
X_train_price_norm = normalizer.transform(X_train['price'].values.reshape(-1,1))
X_cv_price_norm = normalizer.transform(X_cv['price'].values.reshape(-1,1))
X_test_price_norm = normalizer.transform(X_test['price'].values.reshape(-1,1))
print("After vectorizations")
print(X_train_price_norm.shape, y_train.shape)
print(X_cv_price_norm.shape, y_cv.shape)
print(X_test_price_norm.shape, y_test.shape)
print("="*100)
project_data.head()
# TF-IDF (unigram + bigram, min_df=10) features for essays, fitted on the
# train split only.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer(min_df=10,ngram_range=(1,2))
vectorizer_tfidf.fit(X_train['essay'].values)
X_train_essay_tfidf = vectorizer_tfidf.transform(X_train['essay'].values)
X_cv_essay_tfidf = vectorizer_tfidf.transform(X_cv['essay'].values)
X_test_essay_tfidf = vectorizer_tfidf.transform(X_test['essay'].values)
print("After vectorizations")
print(X_train_essay_tfidf.shape, y_train.shape)
print(X_cv_essay_tfidf.shape, y_cv.shape)
print(X_test_essay_tfidf.shape, y_test.shape)
print("="*100)
# Same vectorisation for the (much shorter) project titles.
vectorizer_tfidf = TfidfVectorizer(min_df=10,ngram_range=(1,2))
vectorizer_tfidf.fit(X_train['project_title'].values)
X_train_project_title_tfidf = vectorizer_tfidf.transform(X_train['project_title'].values)
X_cv_project_title_tfidf = vectorizer_tfidf.transform(X_cv['project_title'].values)
X_test_project_title_tfidf = vectorizer_tfidf.transform(X_test['project_title'].values)
print("After vectorizations")
print(X_train_project_title_tfidf.shape, y_train.shape)
print(X_cv_project_title_tfidf.shape, y_cv.shape)
print(X_test_project_title_tfidf.shape, y_test.shape)
print("="*100)
# Stack all feature families into one sparse CSR matrix per split.
from scipy.sparse import hstack
X_tr = hstack((X_train_essay_tfidf,X_train_project_title_tfidf, X_train_state_ohe, X_train_teacher_ohe, X_train_grade_ohe,X_train_price_norm)).tocsr()
X_cr = hstack((X_cv_essay_tfidf,X_cv_project_title_tfidf, X_cv_state_ohe, X_cv_teacher_ohe, X_cv_grade_ohe, X_cv_price_norm)).tocsr()
X_te = hstack((X_test_essay_tfidf,X_test_project_title_tfidf, X_test_state_ohe, X_test_teacher_ohe, X_test_grade_ohe, X_test_price_norm)).tocsr()
print("Final Data matrix")
print(X_tr.shape, y_train.shape)
print(X_cr.shape, y_cv.shape)
print(X_te.shape, y_test.shape)
print("="*100)
# Randomized search over tree hyperparameters: 3-fold CV, ROC-AUC scoring,
# class_weight='balanced' to compensate for the skewed label distribution.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
param_dist = {"max_depth": [1,5,10,50],
"min_samples_leaf": [5,10,100,500],
"criterion": ["gini", "entropy"]}
tree=DecisionTreeClassifier(class_weight = 'balanced')
clf = RandomizedSearchCV(tree, param_dist, cv=3, scoring='roc_auc',return_train_score=True)
clf.fit(X_tr, y_train)
results = pd.DataFrame.from_dict(clf.cv_results_)
results.columns
# 3-D visualisation of the randomized-search results: train vs. CV AUC as
# a function of the two swept hyperparameters.
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
import numpy as np
x1=results.param_min_samples_leaf
y1=results.param_max_depth
z1=results.mean_train_score
x2=results.param_min_samples_leaf
y2=results.param_max_depth
z2=results.mean_test_score
trace1 = go.Scatter3d(x=x1,y=y1,z=z1, name = 'train')
trace2 = go.Scatter3d(x=x2,y=y2,z=z2, name = 'Cross validation')
# NOTE(review): this rebinds the global `data` (previously the feature
# DataFrame); X was already captured above, so nothing downstream breaks.
data = [trace1,trace2]
# BUG FIX: the x axis plots min_samples_leaf, but the original labelled it
# 'n_estimators' (a RandomForest parameter that is not tuned here).
layout = go.Layout(scene = dict(
    xaxis = dict(title='min_samples_leaf'),
    yaxis = dict(title='max_depth'),
    zaxis = dict(title='AUC'),))
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')
# Exhaustive grid search over the same hyperparameter grid, scored by
# ROC-AUC with 3-fold CV; then visualise mean CV AUC as a heatmap.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
param_grid = {"max_depth": [1,5,10,50],
"min_samples_leaf": [5,10,100,500],
"criterion": ["gini", "entropy"]}
tree=DecisionTreeClassifier(class_weight = 'balanced')
clf=GridSearchCV(tree,param_grid,cv=3,scoring='roc_auc')
clf.fit(X_tr, y_train)
print("Best cross-validation score: {:.2f}".format(clf.best_score_))
print("Best parameters: ", clf.best_params_)
results = pd.DataFrame.from_dict(clf.cv_results_)
results.columns
import pandas as pd
# mean CV AUC for each (max_depth, min_samples_leaf) grid cell
pvt = pd.pivot_table(pd.DataFrame(clf.cv_results_),
values='mean_test_score', index='param_max_depth', columns='param_min_samples_leaf')
fig, ax = plt.subplots(figsize=(10,10))
ax = sns.heatmap(pvt,annot=True,fmt='.4g')
# Refit a single tree with the hyperparameters chosen by the grid search
# (max_depth=10, min_samples_leaf=500).  class_weight='balanced' is kept
# for consistency with the estimator used during cross-validation above.
tree=DecisionTreeClassifier(max_depth=10,min_samples_leaf= 500,class_weight='balanced')
from sklearn.metrics import roc_curve, auc
tree.fit(X_tr, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = tree.predict_proba(X_tr)[:,1]
y_test_pred = tree.predict_proba(X_te)[:,1]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: these are ROC curves — the axes are FPR/TPR, not
# "hyperparameter a" vs. "AUC" as the original labels claimed.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (TFIDF features)")
plt.grid()
plt.show()
test_auc_tfidf=auc(test_fpr, test_tpr)
print('{0:.2f}'.format(test_auc_tfidf*100))
def find_best_threshold(threshould, fpr, tpr):
    """Pick the ROC threshold maximising tpr*(1-fpr).

    The product peaks where TPR is high while FPR is still low,
    i.e. at the knee of the ROC curve.
    """
    score = tpr * (1 - fpr)
    best_idx = np.argmax(score)
    t = threshould[best_idx]
    print("the maximum value of tpr*(1-fpr)", max(score), "for threshold", np.round(t, 3))
    return t
def predict_with_best_t(proba, threshould):
    """Binarise probability scores at the given threshold (score >= t -> 1)."""
    return [1 if score >= threshould else 0 for score in proba]
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
# Binarise predictions at the threshold that maximises tpr*(1-fpr) on the
# TRAIN ROC curve, then plot annotated train/test confusion matrices.
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
con_m_train = confusion_matrix(y_train, predict_with_best_t(y_train_pred, best_t))
con_m_test = (confusion_matrix(y_test, predict_with_best_t(y_test_pred, best_t)))
# sklearn's confusion_matrix layout is [[TN, FP], [FN, TP]]
key = (np.asarray([['TN','FP'], ['FN', 'TP']]))
fig, ax = plt.subplots(1,2, figsize=(15,5))
# NOTE: the comprehension variable `key` shadows the outer `key` array
labels_train = (np.asarray(["{0} = {1:.2f}" .format(key, value) for key, value in zip(key.flatten(), con_m_train.flatten())])).reshape(2,2)
labels_test = (np.asarray(["{0} = {1:.2f}" .format(key, value) for key, value in zip(key.flatten(),con_m_test.flatten())])).reshape(2,2)
sns.heatmap(con_m_train, linewidths=.5, xticklabels=['PREDICTED : NO', 'PREDICTED : YES'],yticklabels=['ACTUAL : NO', 'ACTUAL : YES'], annot = labels_train, fmt = '', ax=ax[0])
sns.heatmap(con_m_test, linewidths=.5, xticklabels=['PREDICTED : NO', 'PREDICTED : YES'],yticklabels=['ACTUAL : NO', 'ACTUAL : YES'], annot = labels_test, fmt = '', ax=ax[1])
ax[0].set_title('Train Set')
ax[1].set_title('Test Set')
plt.show()
# Error analysis: collect the essays of FALSE POSITIVES (predicted
# approved, actually rejected) on the test split and render a word cloud.
p=predict_with_best_t(y_test_pred, best_t)
# BUG FIX: y_test[i] is the i-th row of the SHUFFLED test split, so the
# matching essay must come from X_test positionally (.iloc).  The original
# read project_data['essay'][i] — row i of the unshuffled frame — pairing
# predictions with unrelated essays.
text = [X_test['essay'].iloc[i] for i in range(len(p)) if p[i]==1 and y_test[i]==0]
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
stopwords = set(STOPWORDS)
# Join every lowercased token of every false-positive essay into one blob;
# str.join avoids the quadratic repeated concatenation of the original.
comment_words = ' ' + ' '.join(
    token.lower() for val in text for token in str(val).split()) + ' '
wordcloud = WordCloud(width = 800, height = 800,
                      background_color ='white',
                      stopwords = stopwords,
                      min_font_size = 10).generate(comment_words)
# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
# Distribution of 'price' among the test-set false positives.
# BUG FIX: index X_test positionally (.iloc) — project_data['price'][i]
# pointed at unrelated rows of the unshuffled original frame.
price = [X_test['price'].iloc[i] for i in range(len(p)) if p[i]==1 and y_test[i]==0]
import seaborn as sns
sns.set(style="whitegrid")
ax = sns.boxplot(x=price)
# PDF/CDF of 'teacher_number_of_previously_posted_projects' among the
# test-set false positives.
# BUG FIX: use X_test positionally (.iloc); project_data[...][i] indexed
# the unshuffled frame, pairing values with the wrong predictions.
previously_posted_projects = [
    X_test['teacher_number_of_previously_posted_projects'].iloc[i]
    for i in range(len(p)) if p[i]==1 and y_test[i]==0
]
counts, bin_edges = np.histogram(previously_posted_projects, bins=10, density = True)
# normalise to a probability mass per bin, then accumulate for the CDF
pdf = counts/(sum(counts))
print(pdf)
plt.grid()
cdf = np.cumsum(pdf)
plt.plot(bin_edges[1:],pdf)
plt.plot(bin_edges[1:], cdf)
def loadGloveModel(gloveFile):
    """Load a GloVe text file into a dict {word: np.ndarray vector}.

    Each line of the file is "<word> <v1> <v2> ...".
    """
    print ("Loading Glove Model")
    model = {}
    # FIX: use a context manager so the file handle is closed even if a
    # malformed line raises (the original leaked the open handle).
    with open(gloveFile, 'r', encoding='utf8') as f:
        for line in tqdm(f):
            splitLine = line.split()
            word = splitLine[0]
            # the remaining fields are the embedding components
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print ("Done.",len(model)," words loaded!")
    return model
# Load the pretrained 50-d GloVe embeddings (absolute local path).
model = loadGloveModel('/home/shivam/Desktop/assignment 11/glove.6B.50d.txt')
# vocabulary of the pretrained embeddings, for O(1) membership tests below
glove_words = set(model.keys())
# average Word2Vec
# compute average word2vec for each review.
def func(wordlist):
    """Average-GloVe encoder.

    Maps each sentence to the mean of the 50-d GloVe vectors of its
    in-vocabulary words (zero vector when no word is in the vocabulary).
    Prints the count and dimensionality, returns a list of numpy arrays.
    """
    encoded = []
    for sentence in tqdm(wordlist):
        total = np.zeros(50)  # the GloVe file used here is 50-dimensional
        hits = 0              # words of this sentence found in the vocabulary
        for token in sentence.split():
            if token in glove_words:
                total += model[token]
                hits += 1
        if hits != 0:
            total /= hits
        encoded.append(total)
    print(len(encoded))
    print(len(encoded[0]))
    return encoded
# Encode essays and titles with the average-GloVe encoder for all splits.
train_avg_w2v_vectors_essay=func(X_train['essay'])
test_avg_w2v_vectors_essay=func(X_test['essay'])
cv_avg_w2v_vectors_essay=func(X_cv['essay'])
#for titles
cv_avg_w2v_vectors_project_title=func(X_cv['project_title'])
test_avg_w2v_vectors_project_title=func(X_test['project_title'])
train_avg_w2v_vectors_project_title=func(X_train['project_title'])
# Stack the dense w2v blocks with the sparse categorical/price blocks.
from scipy.sparse import hstack
X_tr_w2v = hstack((train_avg_w2v_vectors_essay,train_avg_w2v_vectors_project_title, X_train_state_ohe, X_train_teacher_ohe, X_train_grade_ohe,X_train_price_norm)).tocsr()
X_cr_w2v = hstack((cv_avg_w2v_vectors_essay,cv_avg_w2v_vectors_project_title, X_cv_state_ohe, X_cv_teacher_ohe, X_cv_grade_ohe, X_cv_price_norm)).tocsr()
X_te_w2v = hstack((test_avg_w2v_vectors_essay,test_avg_w2v_vectors_project_title, X_test_state_ohe, X_test_teacher_ohe, X_test_grade_ohe, X_test_price_norm)).tocsr()
print("Final Data matrix")
print(X_tr_w2v.shape, y_train.shape)
print(X_cr_w2v.shape, y_cv.shape)
print(X_te_w2v.shape, y_test.shape)
print("="*100)
# Grid search on the AVG-W2V feature matrix (3-fold CV, ROC-AUC).
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
param_grid = {"max_depth": [1,5,10,50],
"min_samples_leaf": [5,10,100,500],
"criterion": ["gini", "entropy"]}
tree=DecisionTreeClassifier(class_weight = 'balanced')
clf=GridSearchCV(tree,param_grid,cv=3,scoring='roc_auc')
clf.fit(X_tr_w2v, y_train)
print("Best cross-validation score: {:.2f}".format(clf.best_score_))
print("Best parameters: ", clf.best_params_)
results = pd.DataFrame.from_dict(clf.cv_results_)
results.columns
import pandas as pd
# heatmap of mean CV AUC over the (max_depth, min_samples_leaf) grid
pvt = pd.pivot_table(pd.DataFrame(clf.cv_results_),
values='mean_test_score', index='param_max_depth', columns='param_min_samples_leaf')
fig, ax = plt.subplots(figsize=(10,10))
ax = sns.heatmap(pvt,annot=True,fmt='.4g')
# Refit with the grid-search optimum for the AVG-W2V features
# (max_depth=5, min_samples_leaf=500); class_weight='balanced' kept
# consistent with the estimator that was cross-validated above.
tree=DecisionTreeClassifier(max_depth=5,min_samples_leaf= 500,class_weight='balanced')
from sklearn.metrics import roc_curve, auc
tree.fit(X_tr_w2v, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = tree.predict_proba(X_tr_w2v)[:,1]
y_test_pred = tree.predict_proba(X_te_w2v)[:,1]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: ROC curve axes are FPR/TPR, not "hyperparameter" vs. AUC.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (AVG-W2V features)")
plt.grid()
plt.show()
test_auc_avg_w2v=auc(test_fpr, test_tpr)
print('{0:.2f}'.format(test_auc_avg_w2v*100))
def find_best_threshold(threshould, fpr, tpr):
    """Return the ROC threshold where the trade-off tpr*(1-fpr) peaks."""
    product = tpr * (1 - fpr)
    t = threshould[np.argmax(product)]
    print("the maximum value of tpr*(1-fpr)", max(product), "for threshold", np.round(t,3))
    return t
def predict_with_best_t(proba, threshould):
    """Binarise each score: 1 if score >= threshold, else 0."""
    out = []
    for score in proba:
        if score < threshould:
            out.append(0)
        else:
            out.append(1)
    return out
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
# Confusion matrices for the AVG-W2V model at the best train-ROC threshold.
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
con_m_train = confusion_matrix(y_train, predict_with_best_t(y_train_pred, best_t))
con_m_test = (confusion_matrix(y_test, predict_with_best_t(y_test_pred, best_t)))
# sklearn's confusion_matrix layout is [[TN, FP], [FN, TP]]
key = (np.asarray([['TN','FP'], ['FN', 'TP']]))
fig, ax = plt.subplots(1,2, figsize=(15,5))
# NOTE: the comprehension variable `key` shadows the outer `key` array
labels_train = (np.asarray(["{0} = {1:.2f}" .format(key, value) for key, value in zip(key.flatten(), con_m_train.flatten())])).reshape(2,2)
labels_test = (np.asarray(["{0} = {1:.2f}" .format(key, value) for key, value in zip(key.flatten(),con_m_test.flatten())])).reshape(2,2)
sns.heatmap(con_m_train, linewidths=.9, xticklabels=['PREDICTED : NO', 'PREDICTED : YES'],yticklabels=['ACTUAL : NO', 'ACTUAL : YES'], annot = labels_train, fmt = '', ax=ax[0])
sns.heatmap(con_m_test, linewidths=.5, xticklabels=['PREDICTED : NO', 'PREDICTED : YES'],yticklabels=['ACTUAL : NO', 'ACTUAL : YES'], annot = labels_test, fmt = '', ax=ax[1])
ax[0].set_title('Train Set')
ax[1].set_title('Test Set')
plt.show()
#For essay
# Fit a plain TF-IDF on the train essays to obtain per-word IDF values;
# these weight the GloVe vectors inside tf_idf_w2v() below.
tfidf_model_essay = TfidfVectorizer()
tfidf_model_essay.fit(X_train['essay'])
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model_essay.get_feature_names(), list(tfidf_model_essay.idf_)))
tfidf_words_essay = set(tfidf_model_essay.get_feature_names())
def tf_idf_w2v(sent_list):
    """TF-IDF-weighted GloVe encoder for the essay texts.

    Each sentence maps to sum(tfidf(w) * glove[w]) / sum(tfidf(w)) over
    words present both in the GloVe vocabulary and in the fitted essay
    TF-IDF vocabulary (zero vector when no word qualifies).
    """
    vectors = []
    for sentence in tqdm(sent_list):
        tokens = sentence.split()
        # BUG FIX: term frequency must count whole tokens.  The original
        # used sentence.count(word), which counts SUBSTRING occurrences
        # (e.g. "art" inside "start"), inflating the tf of short words.
        token_counts = Counter(tokens)
        vector = np.zeros(50)  # GloVe vectors here are 50-dimensional
        tf_idf_weight = 0
        for word in tokens:
            if (word in glove_words) and (word in tfidf_words_essay):
                vec = model[word]  # pretrained vector for this word
                # tf-idf = idf(word) * tf(word within this sentence)
                tf_idf = dictionary[word] * (token_counts[word] / len(tokens))
                vector += (vec * tf_idf)
                tf_idf_weight += tf_idf
        if tf_idf_weight != 0:
            vector /= tf_idf_weight
        vectors.append(vector)
    print(len(vectors))
    print(len(vectors[0]))
    return vectors
#For essay
X_train_tfidf_w2v_essay=tf_idf_w2v(X_train['essay'])
X_test_tfidf_w2v_essay=tf_idf_w2v(X_test['essay'])
X_cv_tfidf_w2v_essay=tf_idf_w2v(X_cv['essay'])
#for titles
# Refit IDF weights on the train titles.  NOTE(review): this rebinds the
# global `dictionary` consumed by the tf_idf_w2v redefined just below.
tfidf_model_project_title = TfidfVectorizer()
tfidf_model_project_title.fit(X_train['project_title'])
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model_project_title.get_feature_names(), list(tfidf_model_project_title.idf_)))
tfidf_words_project_title = set(tfidf_model_project_title.get_feature_names())
def tf_idf_w2v(sent_list):
    """TF-IDF-weighted GloVe encoder for the project titles.

    Each sentence maps to sum(tfidf(w) * glove[w]) / sum(tfidf(w)) over
    words present both in the GloVe vocabulary and in the fitted title
    TF-IDF vocabulary (zero vector when no word qualifies).
    """
    vectors = []
    for sentence in tqdm(sent_list):
        tokens = sentence.split()
        # BUG FIX: term frequency must count whole tokens.  The original
        # used sentence.count(word), which counts SUBSTRING occurrences
        # (e.g. "art" inside "start"), inflating the tf of short words.
        token_counts = Counter(tokens)
        vector = np.zeros(50)  # GloVe vectors here are 50-dimensional
        tf_idf_weight = 0
        for word in tokens:
            if (word in glove_words) and (word in tfidf_words_project_title):
                vec = model[word]  # pretrained vector for this word
                # tf-idf = idf(word) * tf(word within this sentence)
                tf_idf = dictionary[word] * (token_counts[word] / len(tokens))
                vector += (vec * tf_idf)
                tf_idf_weight += tf_idf
        if tf_idf_weight != 0:
            vector /= tf_idf_weight
        vectors.append(vector)
    print(len(vectors))
    print(len(vectors[0]))
    return vectors
# Encode the titles with the title-IDF-weighted encoder, then stack the
# dense TFIDF-W2V blocks with the sparse categorical/price blocks.
X_train_tfidf_w2v_project=tf_idf_w2v(X_train['project_title'])
X_test_tfidf_w2v_project=tf_idf_w2v(X_test['project_title'])
X_cv_tfidf_w2v_project=tf_idf_w2v(X_cv['project_title'])
from scipy.sparse import hstack
X_tr_tfidf_w2v = hstack((X_train_tfidf_w2v_essay,X_train_tfidf_w2v_project, X_train_state_ohe, X_train_teacher_ohe, X_train_grade_ohe,X_train_price_norm)).tocsr()
X_cr_tfidf_w2v = hstack((X_cv_tfidf_w2v_essay,X_cv_tfidf_w2v_project, X_cv_state_ohe, X_cv_teacher_ohe, X_cv_grade_ohe, X_cv_price_norm)).tocsr()
X_te_tfidf_w2v = hstack((X_test_tfidf_w2v_essay,X_test_tfidf_w2v_project, X_test_state_ohe, X_test_teacher_ohe, X_test_grade_ohe, X_test_price_norm)).tocsr()
print("Final Data matrix")
print(X_tr_tfidf_w2v.shape, y_train.shape)
print(X_cr_tfidf_w2v.shape, y_cv.shape)
print(X_te_tfidf_w2v.shape, y_test.shape)
print("="*100)
# Grid search on the TFIDF-W2V feature matrix (3-fold CV, ROC-AUC), plus a
# heatmap of mean CV AUC over the hyperparameter grid.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
param_grid = {"max_depth": [1,5,10,50],
"min_samples_leaf": [5,10,100,500],
"criterion": ["gini", "entropy"]}
tree=DecisionTreeClassifier(class_weight = 'balanced')
clf=GridSearchCV(tree,param_grid,cv=3,scoring='roc_auc')
clf.fit(X_tr_tfidf_w2v, y_train)
print("Best cross-validation score: {:.2f}".format(clf.best_score_))
print("Best parameters: ", clf.best_params_)
results = pd.DataFrame.from_dict(clf.cv_results_)
results.columns
import pandas as pd
pvt = pd.pivot_table(pd.DataFrame(clf.cv_results_),
values='mean_test_score', index='param_max_depth', columns='param_min_samples_leaf')
fig, ax = plt.subplots(figsize=(10,10))
ax = sns.heatmap(pvt,annot=True,fmt='.4g')
# Refit with the grid-search optimum for the TFIDF-W2V features
# (max_depth=10, min_samples_leaf=500); class_weight='balanced' kept
# consistent with the estimator cross-validated above.
tree=DecisionTreeClassifier(max_depth=10,min_samples_leaf= 500,class_weight='balanced')
from sklearn.metrics import roc_curve, auc
tree.fit(X_tr_tfidf_w2v, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = tree.predict_proba(X_tr_tfidf_w2v)[:,1]
y_test_pred = tree.predict_proba(X_te_tfidf_w2v)[:,1]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: ROC curve axes are FPR/TPR, not "hyperparameter" vs. AUC.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (TFIDF-W2V features)")
plt.grid()
plt.show()
test_auc_tfidf_w2v=auc(test_fpr, test_tpr)
print('{0:.2f}'.format(test_auc_tfidf_w2v*100))
def find_best_threshold(threshould, fpr, tpr):
    """Choose the decision threshold that maximises tpr*(1-fpr) on the ROC."""
    objective = tpr * (1 - fpr)
    idx = np.argmax(objective)
    print("the maximum value of tpr*(1-fpr)", max(objective), "for threshold", np.round(threshould[idx], 3))
    return threshould[idx]
def predict_with_best_t(proba, threshould):
    """Map each probability to 1 when it meets the threshold, else 0."""
    return [int(score >= threshould) for score in proba]
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
# Confusion matrices for the TFIDF-W2V model at the best train threshold.
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
con_m_train = confusion_matrix(y_train, predict_with_best_t(y_train_pred, best_t))
con_m_test = (confusion_matrix(y_test, predict_with_best_t(y_test_pred, best_t)))
# sklearn's confusion_matrix layout is [[TN, FP], [FN, TP]]
key = (np.asarray([['TN','FP'], ['FN', 'TP']]))
fig, ax = plt.subplots(1,2, figsize=(15,5))
# NOTE: the comprehension variable `key` shadows the outer `key` array
labels_train = (np.asarray(["{0} = {1:.2f}" .format(key, value) for key, value in zip(key.flatten(), con_m_train.flatten())])).reshape(2,2)
labels_test = (np.asarray(["{0} = {1:.2f}" .format(key, value) for key, value in zip(key.flatten(),con_m_test.flatten())])).reshape(2,2)
sns.heatmap(con_m_train, linewidths=.5, xticklabels=['PREDICTED : NO', 'PREDICTED : YES'],yticklabels=['ACTUAL : NO', 'ACTUAL : YES'], annot = labels_train, fmt = '', ax=ax[0])
sns.heatmap(con_m_test, linewidths=.5, xticklabels=['PREDICTED : NO', 'PREDICTED : YES'],yticklabels=['ACTUAL : NO', 'ACTUAL : YES'], annot = labels_test, fmt = '', ax=ax[1])
ax[0].set_title('Train Set')
ax[1].set_title('Test Set')
plt.show()
# Rebuild the TFIDF feature matrices (same as earlier) for the
# feature-importance experiment.
from scipy.sparse import hstack
X_tr = hstack((X_train_essay_tfidf,X_train_project_title_tfidf, X_train_state_ohe, X_train_teacher_ohe, X_train_grade_ohe,X_train_price_norm)).tocsr()
X_cr = hstack((X_cv_essay_tfidf,X_cv_project_title_tfidf, X_cv_state_ohe, X_cv_teacher_ohe, X_cv_grade_ohe, X_cv_price_norm)).tocsr()
X_te = hstack((X_test_essay_tfidf,X_test_project_title_tfidf, X_test_state_ohe, X_test_teacher_ohe, X_test_grade_ohe, X_test_price_norm)).tocsr()
print("Final Data matrix")
print(X_tr.shape, y_train.shape)
print(X_cr.shape, y_cv.shape)
print(X_te.shape, y_test.shape)
print("="*100)
from sklearn.model_selection import GridSearchCV
param_grid = {"max_depth": [1,5,10,50],
"min_samples_leaf": [5,10,100,500],
"criterion": ["gini", "entropy"]}
# NOTE(review): `tree` here is whatever estimator the previous cell left
# bound (not a freshly constructed balanced tree); the grid overrides
# max_depth/min_samples_leaf/criterion but not class_weight — confirm.
clf=GridSearchCV(tree,param_grid,cv=3,scoring='roc_auc')
clf.fit(X_tr, y_train)
print("Best cross-validation score: {:.2f}".format(clf.best_score_))
print("Best parameters: ", clf.best_params_)
results = pd.DataFrame.from_dict(clf.cv_results_)
results.columns
import pandas as pd
pvt = pd.pivot_table(pd.DataFrame(clf.cv_results_),
values='mean_test_score', index='param_max_depth', columns='param_min_samples_leaf')
# Fit a tree on the full TFIDF matrix and keep only the features it
# actually used (importance > 0), to retrain on a reduced matrix.
from sklearn.tree import DecisionTreeClassifier
tree=DecisionTreeClassifier(max_depth=5,min_samples_leaf= 500,class_weight='balanced')
from sklearn.metrics import roc_curve, auc
tree.fit(X_tr, y_train)
feature_impce = tree.feature_importances_
relevant_features = []  # importance values of the kept features
cols = []               # their column indices in X_tr
for i, val in enumerate(feature_impce):
    if val > 0:
        relevant_features.append(val)
        cols.append(i)
# IMPROVEMENT: slice the CSR matrices directly instead of the original
# .todense()[:, cols] — densifying an (n x ~100k) tfidf matrix wastes
# memory and returns the deprecated np.matrix type; sklearn trees accept
# sparse input natively.
X_tr_new = X_tr[:, cols]
X_test_new = X_te[:, cols]
# Grid search again, this time on the reduced (importance-selected) matrix.
from sklearn.model_selection import GridSearchCV
param_grid = {"max_depth": [1,5,10,50],
"min_samples_leaf": [5,10,100,500],
"criterion": ["gini", "entropy"]}
clf=GridSearchCV(tree,param_grid,cv=3,scoring='roc_auc')
clf.fit(X_tr_new, y_train)
print("Best cross-validation score: {:.2f}".format(clf.best_score_))
print("Best parameters: ", clf.best_params_)
results = pd.DataFrame.from_dict(clf.cv_results_)
results.columns
import pandas as pd
pvt = pd.pivot_table(pd.DataFrame(clf.cv_results_),
values='mean_test_score', index='param_max_depth', columns='param_min_samples_leaf')
# Refit on the importance-selected features (max_depth=10,
# min_samples_leaf=500); class_weight='balanced' kept consistent with the
# estimator used elsewhere in this notebook.
tree=DecisionTreeClassifier(max_depth=10,min_samples_leaf= 500,class_weight='balanced')
from sklearn.metrics import roc_curve, auc
tree.fit(X_tr_new, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = tree.predict_proba(X_tr_new)[:,1]
y_test_pred = tree.predict_proba(X_test_new)[:,1]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: ROC curve axes are FPR/TPR, not "hyperparameter" vs. AUC.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (selected TFIDF features)")
plt.grid()
plt.show()
test_auc_feature_imp=auc(test_fpr, test_tpr)
print('{0:.2f}'.format(test_auc_feature_imp*100))
def find_best_threshold(threshould, fpr, tpr):
    """Return the threshold where the ROC trade-off tpr*(1-fpr) is maximal."""
    tradeoff = tpr * (1 - fpr)
    best = np.argmax(tradeoff)
    t = threshould[best]
    print("the maximum value of tpr*(1-fpr)", max(tradeoff), "for threshold", np.round(t,3))
    return t
def predict_with_best_t(proba, threshould):
    """Threshold probability scores into hard 0/1 predictions."""
    labels = []
    for value in proba:
        labels.append(1 if value >= threshould else 0)
    return labels
import seaborn as sns; sns.set()
from sklearn.metrics import confusion_matrix
# Confusion matrices for the feature-selected model.
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
con_m_train = confusion_matrix(y_train, predict_with_best_t(y_train_pred, best_t))
con_m_test = (confusion_matrix(y_test, predict_with_best_t(y_test_pred, best_t)))
# sklearn's confusion_matrix layout is [[TN, FP], [FN, TP]]
key = (np.asarray([['TN','FP'], ['FN', 'TP']]))
fig, ax = plt.subplots(1,2, figsize=(15,5))
# NOTE: the comprehension variable `key` shadows the outer `key` array
labels_train = (np.asarray(["{0} = {1:.2f}" .format(key, value) for key, value in zip(key.flatten(), con_m_train.flatten())])).reshape(2,2)
labels_test = (np.asarray(["{0} = {1:.2f}" .format(key, value) for key, value in zip(key.flatten(),con_m_test.flatten())])).reshape(2,2)
sns.heatmap(con_m_train, linewidths=.5, xticklabels=['PREDICTED : NO', 'PREDICTED : YES'],yticklabels=['ACTUAL : NO', 'ACTUAL : YES'], annot = labels_train, fmt = '', ax=ax[0])
sns.heatmap(con_m_test, linewidths=.5, xticklabels=['PREDICTED : NO', 'PREDICTED : YES'],yticklabels=['ACTUAL : NO', 'ACTUAL : YES'], annot = labels_test, fmt = '', ax=ax[1])
ax[0].set_title('Train Set')
ax[1].set_title('Test Set')
plt.show()
# Repeat the false-positive essay word-cloud analysis for the
# feature-selected model.
p=predict_with_best_t(y_test_pred, best_t)
# BUG FIX: pair predictions with the TEST split rows positionally via
# X_test['essay'].iloc[i]; the original indexed the unshuffled
# project_data frame, collecting unrelated essays.
text = [X_test['essay'].iloc[i] for i in range(len(p)) if p[i]==1 and y_test[i]==0]
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
stopwords = set(STOPWORDS)
# Single lowercase blob of all false-positive essay tokens; str.join
# avoids the quadratic repeated concatenation of the original.
comment_words = ' ' + ' '.join(
    token.lower() for val in text for token in str(val).split()) + ' '
wordcloud = WordCloud(width = 800, height = 800,
                      background_color ='white',
                      stopwords = stopwords,
                      min_font_size = 10).generate(comment_words)
# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
from prettytable import PrettyTable
# Final summary: test AUC per featurisation with the hyperparameters each
# final model was actually refit with ([max_depth, min_samples_leaf]).
table = PrettyTable()
table.field_names = ["Vectorizer", "Hyper Parameter", "AUC"]
# BUG FIX: the original reported [10, 500] for every row, but the AVG_W2V
# model was refit with max_depth=5 (see its refit cell above).
table.add_row(["TFIDF", [10, 500], test_auc_tfidf])
table.add_row(["AVG_W2V", [5, 500], test_auc_avg_w2v])
table.add_row(["TFIDF_W2V", [10, 500], test_auc_tfidf_w2v])
table.add_row(["TFIDF FEATURE_IMP", [10, 500], test_auc_feature_imp])
print(table)